# Creates C data structures for binary lookup table of entities,
# using python's html5 entity data.
# Usage: python3 tools/make_entities_inc.py > src/entities.inc

import html
import html.entities

entities5 = html.entities.html5

# Keep only the names that end in a semicolon, and drop the semicolon.
# HTML5 also allows some named character references without a trailing
# semicolon; those keys are left out of the table.
entities = sorted((k[:-1], v) for k, v in entities5.items() if k.endswith(';'))

# Bit layout of one packed table entry. These widths drive the range checks,
# the packing, and the ENT_* accessor macros written to the output, so the
# generated C code always agrees with the generated data.
#   bits  0-14: offset of the entity's bytes inside the text table
#   bits 15-19: size of the entity name in bytes
#   bits 20-22: size of the UTF-8 replacement in bytes
TEXT_IDX_BITS = 15
NAME_SIZE_BITS = 5
REPL_SIZE_BITS = 3
NAME_SIZE_SHIFT = TEXT_IDX_BITS
REPL_SIZE_SHIFT = TEXT_IDX_BITS + NAME_SIZE_BITS

HEADER_TEMPLATE = """/* Autogenerated by tools/make_entities_inc.py */

#define ENT_MIN_LENGTH      2
#define ENT_MAX_LENGTH      32
#define ENT_TABLE_SIZE      %(count)d
#define ENT_TEXT_IDX(x)     ((x) & 0x%(idx_mask)X)
#define ENT_NAME_SIZE(x)    (((x) >> %(name_shift)d) & 0x%(name_mask)X)
#define ENT_REPL_SIZE(x)    ((x) >> %(repl_shift)d)

static const uint32_t cmark_entities[%(count)d] = {
"""


def pack_entry(text_idx, ent_size, repl_size):
    """Pack a text offset and two byte sizes into one table entry.

    Args:
        text_idx: offset of the entity's bytes inside the text table.
        ent_size: size of the entity name in bytes.
        repl_size: size of the UTF-8 encoded replacement in bytes.

    Returns:
        The three values combined into a single integer using the bit
        layout described next to the *_BITS constants.

    Raises:
        ValueError: if any value does not fit into its bit field.
    """
    if text_idx >= (1 << TEXT_IDX_BITS):
        raise ValueError("text index too large")
    if ent_size >= (1 << NAME_SIZE_BITS):
        raise ValueError("entity name too long")
    if repl_size >= (1 << REPL_SIZE_BITS):
        raise ValueError("entity replacement too long")
    return text_idx | ent_size << NAME_SIZE_SHIFT | repl_size << REPL_SIZE_SHIFT


def build_tables(pairs):
    """Build the packed entry table and the text table.

    Args:
        pairs: iterable of (entity name, replacement text) string pairs, in
            the order they should appear in the tables.

    Returns:
        A (main_table, text_table) tuple: a list with one packed integer per
        pair, and a bytes object holding every name immediately followed by
        its replacement, both UTF-8 encoded.

    Raises:
        ValueError: if an offset or size does not fit into its bit field.
    """
    packed = []
    chunks = []
    text_idx = 0
    for ent, repl in pairs:
        ent_bytes = ent.encode('UTF-8')
        repl_bytes = repl.encode('UTF-8')
        packed.append(pack_entry(text_idx, len(ent_bytes), len(repl_bytes)))
        chunks.append(ent_bytes)
        chunks.append(repl_bytes)
        text_idx += len(ent_bytes) + len(repl_bytes)
    # Join once at the end instead of growing an immutable bytes object.
    return packed, b''.join(chunks)


def format_array(values, fmt, per_line):
    """Format integers as the body of a C array initializer.

    Args:
        values: sliceable sequence of integers (a list or a bytes object).
        fmt: %-style format applied to each value, e.g. "0x%02X".
        per_line: number of values to put on each output line.

    Returns:
        The initializer rows, each indented by two spaces, with a comma
        after every value except the last and a newline after every row.
        An empty sequence gives an empty string.
    """
    rows = [
        "  " + ", ".join(fmt % v for v in values[start:start + per_line])
        for start in range(0, len(values), per_line)
    ]
    if not rows:
        return ""
    return ",\n".join(rows) + "\n"


def render(main_table, text_table):
    """Return the complete text of the generated include file.

    Args:
        main_table: list of packed entries as returned by build_tables().
        text_table: bytes object as returned by build_tables().

    Returns:
        The accessor macros followed by the two C array definitions.
    """
    header = HEADER_TEMPLATE % {
        "count": len(main_table),
        "idx_mask": (1 << TEXT_IDX_BITS) - 1,
        "name_shift": NAME_SIZE_SHIFT,
        "name_mask": (1 << NAME_SIZE_BITS) - 1,
        "repl_shift": REPL_SIZE_SHIFT,
    }
    text_decl = (
        "};\n\nstatic const unsigned char cmark_entity_text[%d] = {\n"
        % len(text_table)
    )
    return "".join([
        header,
        format_array(main_table, "0x%X", 6),
        text_decl,
        format_array(text_table, "0x%02X", 12),
        "};\n",
    ])


main_table, text_table = build_tables(entities)

# The rendered text already ends with a newline.
print(render(main_table, text_table), end="")
